Part I: Data Exploration¶

We need to explore the dataset, find relationships between features, and determine how the dataset should be formatted or modified for the prediction analysis.

In [123]:
import numpy as np
import pandas
import seaborn
from matplotlib import pyplot as plt
In [124]:
# Load the dataset from the CSV file 'AmesHousing.csv' (relative path),
# using its "Order" column as the row index.
data = pandas.read_csv('AmesHousing.csv', index_col="Order")

# Display the loaded DataFrame as this cell's output.
data
Out[124]:
PID MS SubClass MS Zoning Lot Frontage Lot Area Street Alley Lot Shape Land Contour Utilities ... Pool Area Pool QC Fence Misc Feature Misc Val Mo Sold Yr Sold Sale Type Sale Condition SalePrice
Order
1 526301100 20 RL 141.0 31770 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN NaN 0 5 2010 WD Normal 215000
2 526350040 20 RH 80.0 11622 Pave NaN Reg Lvl AllPub ... 0 NaN MnPrv NaN 0 6 2010 WD Normal 105000
3 526351010 20 RL 81.0 14267 Pave NaN IR1 Lvl AllPub ... 0 NaN NaN Gar2 12500 6 2010 WD Normal 172000
4 526353030 20 RL 93.0 11160 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 4 2010 WD Normal 244000
5 527105010 60 RL 74.0 13830 Pave NaN IR1 Lvl AllPub ... 0 NaN MnPrv NaN 0 3 2010 WD Normal 189900
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2926 923275080 80 RL 37.0 7937 Pave NaN IR1 Lvl AllPub ... 0 NaN GdPrv NaN 0 3 2006 WD Normal 142500
2927 923276100 20 RL NaN 8885 Pave NaN IR1 Low AllPub ... 0 NaN MnPrv NaN 0 6 2006 WD Normal 131000
2928 923400125 85 RL 62.0 10441 Pave NaN Reg Lvl AllPub ... 0 NaN MnPrv Shed 700 7 2006 WD Normal 132000
2929 924100070 20 RL 77.0 10010 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 4 2006 WD Normal 170000
2930 924151050 60 RL 74.0 9627 Pave NaN Reg Lvl AllPub ... 0 NaN NaN NaN 0 11 2006 WD Normal 188000

2930 rows × 81 columns

In [125]:
# Function to map categorical features to int
def generate_ordinal_encoding(column):
    """Build an integer code table for the distinct values of a column.

    Parameters
    ----------
    column
        Object exposing ``unique()`` (the notebook passes DataFrame columns).

    Returns
    -------
    dict
        Maps each code ``0, 1, 2, ...`` to the distinct value found at that
        position in ``column.unique()``.
    """
    # Number the distinct values in the order ``unique()`` reports them.
    return dict(enumerate(column.unique()))

def generate_ordinal_encoding_for_all_columns(dataset):
    """Build a code table for every object-dtype column of ``dataset``.

    Parameters
    ----------
    dataset
        Frame whose columns are inspected; only columns whose ``dtype``
        compares equal to ``object`` are encoded.

    Returns
    -------
    dict
        Maps each selected column name to the table produced by
        ``generate_ordinal_encoding`` for that column.
    """
    return {
        name: generate_ordinal_encoding(dataset[name])
        for name in dataset.columns
        if dataset[name].dtype == object
    }

def apply_ordinal_encoding_to_all_columns(dataset, columns_dict):
    """Replace column values with their integer codes, modifying ``dataset`` in place.

    Parameters
    ----------
    dataset
        Frame whose columns are overwritten; nothing is returned.
    columns_dict : dict
        Maps a column name to a ``{code: value}`` table (the shape produced
        by ``generate_ordinal_encoding_for_all_columns``).

    Notes
    -----
    Each table is inverted to ``{value: code}`` and handed to ``Series.map``;
    how values missing from the table are treated is up to ``Series.map``.
    """
    for name, code_to_value in columns_dict.items():
        # Invert the table so that lookups go from original value to code.
        value_to_code = {}
        for code, value in code_to_value.items():
            value_to_code[value] = code
        dataset[name] = dataset[name].map(value_to_code)

def apply_onehot_encoding(column):
    """One-hot encode a column into boolean indicator columns.

    Parameters
    ----------
    column : pandas.Series
        Column to encode.

    Returns
    -------
    pandas.DataFrame
        One column per distinct value in ``column.unique()`` (labelled with
        that value), sharing ``column``'s index. A cell is True where the row
        holds that value. If the column contains missing values, the
        indicator labelled with the missing marker is True exactly on the
        missing rows.
    """
    encoding = {}
    for feature in column.unique():
        if pandas.api.types.is_scalar(feature) and pandas.isna(feature):
            # NaN never compares equal to anything (itself included), so an
            # equality test would leave this indicator all False and give a
            # constant column. Flag the missing entries explicitly instead.
            encoding[feature] = column.isna()
        else:
            encoding[feature] = column.apply(lambda x: x == feature)
    return pandas.DataFrame(encoding, index=column.index)

# Function to draw correlation matrix
def draw_correlation_matrix(dataset, figsize=None, title="Correlation Heatmap"):
    """Show the pairwise correlations of ``dataset`` as an annotated heatmap.

    Parameters
    ----------
    dataset
        Frame whose ``corr()`` result is plotted.
    figsize : tuple, optional
        When truthy, a new pyplot figure of this size is opened first;
        otherwise no figure is created explicitly.
    title : str
        Title placed on the plot.
    """
    if figsize:
        plt.figure(figsize=figsize)

    correlations = dataset.corr()
    # Annotate every cell with its coefficient, rounded to two decimals.
    heatmap_style = {"annot": True, "fmt": '.2f', "cmap": 'Pastel2', "linewidths": 2}
    seaborn.heatmap(correlations, **heatmap_style)

    plt.title(title)
    plt.show()
In [126]:
# Correlation heatmap over the numeric columns only.
draw_correlation_matrix(
    data.select_dtypes(include="number"),
    figsize=(30, 20),
)
No description has been provided for this image
In [129]:
# One heatmap per object-dtype (categorical) column: the column is expanded
# into one-hot indicators, SalePrice is appended, and their correlations drawn.
for column in (name for name in data.columns if data[name].dtype == object):
    encoding = apply_onehot_encoding(data[column])
    encoding["SalePrice"] = data["SalePrice"]
    draw_correlation_matrix(
        encoding,
        figsize=(20, 15),
        title="Correlation Heatmap of " + column,
    )
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image